{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import math\n",
    "\n",
    "from imblearn.over_sampling import SMOTE\n",
    "from imblearn.under_sampling import EditedNearestNeighbours\n",
    "\n",
    "from sklearn.ensemble import AdaBoostClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_accuracy(AL, y, verbose=1):\n",
    "    \n",
    "    try:\n",
    "        AL = np.array(AL)\n",
    "        y = np.array(y)\n",
    "\n",
    "        AL = AL.reshape(-1)\n",
    "        y = y.reshape(-1)\n",
    "\n",
    "        AL = AL > 0.5\n",
    "        AL = AL.astype(int)\n",
    "\n",
    "        y = y > 0.5\n",
    "        y = y.astype(int)\n",
    "\n",
    "        total = AL.shape[0]\n",
    "\n",
    "        TP = np.sum(np.logical_and(AL==1, y==1))\n",
    "        TN = np.sum(np.logical_and(AL==0, y==0))\n",
    "\n",
    "        FP = np.sum(np.logical_and(AL==1, y==0))\n",
    "        FN = np.sum(np.logical_and(AL==0, y==1))\n",
    "\n",
    "        P = TP / (TP + FP)\n",
    "        R = TP / (TP + FN)\n",
    "        F1 = (2 * P * R) / (P + R)\n",
    "\n",
    "\n",
    "        acc = np.sum(AL == y)/total\n",
    "\n",
    "\n",
    "        if verbose == 1:\n",
    "            print(\"\\nAccuracy: {} \\n\".format(acc))\n",
    "            print(\"True Positive: {} \\nTrue Negative: {}\\nFalse Positive: {} \\nFalse Negative: {}\\n\".format(TP, TN, FP, FN))\n",
    "            print(\"Precision: {} \\nRecall: {} \\nF1 Score: {}\\n\".format(P, R, F1))\n",
    "        \n",
    "        return acc\n",
    "    except:\n",
    "        return 0\n",
    "        \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/warproxxx/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
      "of pandas will change to not sort by default.\n",
      "\n",
      "To accept the future behavior, pass 'sort=False'.\n",
      "\n",
      "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
      "\n",
      "  if __name__ == '__main__':\n"
     ]
    }
   ],
   "source": [
    "bot_accounts = pd.concat([pd.read_csv('data/social_spambots_1.csv'), pd.read_csv('data/social_spambots_2.csv'), pd.read_csv('data/social_spambots_3.csv')]).reset_index(drop=True)\n",
    "clean_accounts = pd.read_csv('data/geniune_accounts.csv')\n",
    "\n",
    "requiredColumns = ['screen_name', 'created_at', 'updated', 'location', 'verified', 'statuses_count', 'friends_count','followers_count', 'favourites_count', 'default_profile_image', 'profile_use_background_image', 'protected', 'default_profile']\n",
    "bot_accounts = bot_accounts[requiredColumns]\n",
    "clean_accounts = clean_accounts[requiredColumns]\n",
    "\n",
    "def clean_df(df):\n",
    "    df['created_at'] = pd.to_datetime(df['created_at'])\n",
    "    df['updated'] = pd.to_datetime(df['updated'])\n",
    "    df['age'] = (df['updated'] - df['created_at']).astype('timedelta64[D]').astype(int)\n",
    "    df['has_location'] = df['location'].apply(lambda x: 0 if x==x else 1)\n",
    "    df['has_avatar'] = df['default_profile_image'].apply(lambda x: 1 if x==x else 0)\n",
    "    df['has_background'] = df['profile_use_background_image'].apply(lambda x: 1 if x==x else 0)\n",
    "    df['is_verified']=df['verified'].apply(lambda x: 1 if x==x else 0)\n",
    "    df['is_protected']=df['protected'].apply(lambda x: 1 if x==x else 0)\n",
    "    df['profile_modified'] = df['default_profile'].apply(lambda x: 0 if x==x else 1)\n",
    "    df = df.rename(index=str, columns={\"screen_name\": \"username\", \"statuses_count\": \"total_tweets\", \"friends_count\": \"total_following\", \"followers_count\": \"total_followers\", \"favourites_count\": \"total_likes\"})\n",
    "    return df[['username', 'age', 'has_location', 'is_verified', 'total_tweets', 'total_following', 'total_followers', 'total_likes', 'has_avatar', 'has_background', 'is_protected', 'profile_modified']]\n",
    "\n",
    "bot_accounts = clean_df(bot_accounts)\n",
    "clean_accounts = clean_df(clean_accounts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "bot_accounts['BotOrNot'] = 1\n",
    "clean_accounts['BotOrNot'] = 0\n",
    "\n",
    "combined_df = pd.concat([bot_accounts, clean_accounts])\n",
    "\n",
    "new_df = combined_df.sample(frac=1).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "training_df = new_df.drop('username', axis=1)[:int(combined_df.shape[0] * 0.8)]\n",
    "test_df = new_df.drop('username', axis=1)[int(combined_df.shape[0] * 0.8):]\n",
    "\n",
    "columns_to_standardize = ['age', 'total_tweets', 'total_following', 'total_followers', 'total_likes']\n",
    "\n",
    "training_df_mean = training_df[columns_to_standardize].mean()\n",
    "training_df_std = training_df[columns_to_standardize].std()\n",
    "\n",
    "training_df[columns_to_standardize] = (training_df[columns_to_standardize] - training_df_mean)/training_df_std\n",
    "test_df[columns_to_standardize] = (test_df[columns_to_standardize] - training_df_mean)/training_df_std\n",
    "\n",
    "# training_df_mean = training_df.mean()\n",
    "# training_df_std = training_df.std()\n",
    "\n",
    "# training_df = (training_df - training_df_mean)/training_df_std\n",
    "# test_df = (test_df - training_df_mean)/training_df_std\n",
    "\n",
    "# max_vals = training_df.max()\n",
    "\n",
    "# training_df = training_df/max_vals\n",
    "# test_df = test_df/max_vals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train = training_df.drop(['BotOrNot', 'is_protected'], axis=1).values\n",
    "y_train = training_df['BotOrNot'].values.reshape(-1,1)\n",
    "\n",
    "X_test = test_df.drop(['BotOrNot', 'is_protected'], axis=1).values\n",
    "y_test = test_df['BotOrNot'].values.reshape(-1,1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "s = SMOTE()\n",
    "smote_X, smote_y = s.fit_resample(X_train, y_train.reshape(-1))\n",
    "\n",
    "e = EditedNearestNeighbours()\n",
    "r_X, r_y = e.fit_resample(smote_X, smote_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "a = AdaBoostClassifier(n_estimators=150, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,\n",
       "          learning_rate=1.0, n_estimators=150, random_state=0)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a.fit(r_X, r_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_predict = a.predict(r_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Accuracy: 1.0 \n",
      "\n",
      "True Positive: 3778 \n",
      "True Negative: 3906\n",
      "False Positive: 0 \n",
      "False Negative: 0\n",
      "\n",
      "Precision: 1.0 \n",
      "Recall: 1.0 \n",
      "F1 Score: 1.0\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_accuracy(y_predict, r_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_test_predict = a.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Accuracy: 0.9845053635280095 \n",
      "\n",
      "True Positive: 984 \n",
      "True Negative: 668\n",
      "False Positive: 4 \n",
      "False Negative: 22\n",
      "\n",
      "Precision: 0.9959514170040485 \n",
      "Recall: 0.9781312127236581 \n",
      "F1 Score: 0.9869608826479438\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.9845053635280095"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_accuracy(y_test_predict, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda env:anaconda3]",
   "language": "python",
   "name": "conda-env-anaconda3-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
